Country level analysis¶
Load libraries¶
import warnings
from functools import partial
import covid_analysis.utils.paths as path
import janitor
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_flavor as pf
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
from plotly.offline import init_notebook_mode
Set defaults for plots¶
# matplotlib
# The seaborn style sheets were renamed to "seaborn-v0_8-*" in matplotlib 3.6
# and the old names were removed in 3.8. Use whichever name this installation
# provides; fall back to the historical name so that matplotlib itself reports
# the problem if neither exists.
_whitegrid_style = next(
    (
        style_name
        for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if style_name in plt.style.available
    ),
    "seaborn-whitegrid",
)
plt.style.use(_whitegrid_style)
plt.rcParams["figure.figsize"] = (10, 8)
# seaborn
sns.set_style("whitegrid")
# plotly
init_notebook_mode()
pio.templates.default = "plotly_white"
pd.options.plotting.backend = "plotly"
# Silence plotting warnings. NOTE: this filter is global, so it hides every
# warning raised after this point, not only the ones coming from the plots.
warnings.filterwarnings("ignore")
Utility functions¶
Define input directory¶
# Directory the CSV inputs below are read from, as returned by the project's path helper.
input_dir = path.data_processed_dir()
Load data¶
Confirmed and deaths time series¶
# Confirmed cases and deaths per country and date, with "date" parsed to datetimes.
hopkins_tidy_cumulative_df = (
    pd.read_csv(
        filepath_or_buffer=input_dir.joinpath("hopkins_tidy_cumulative.csv")
    )
    # Convert the whole column in a single vectorized call. janitor's
    # transform_column applies the function element by element by default,
    # which is needlessly slow for pd.to_datetime on a long column.
    .assign(date=lambda df: pd.to_datetime(df["date"]))
)
hopkins_tidy_cumulative_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118755 entries, 0 to 118754
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 country 118755 non-null object
1 date 118755 non-null datetime64[ns]
2 confirmed 118755 non-null int64
3 deaths 118755 non-null int64
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 3.6+ MB
Vaccination time series¶
# Vaccination figures per country and date; no date parsing is applied here,
# so "date" keeps whatever dtype read_csv infers.
vaccination_tidy_cumulative_df = pd.read_csv(
    input_dir.joinpath("vaccination_country_cumulative.csv")
)
vaccination_tidy_cumulative_df.head(1)
| | country | date | doses_admin | people_partially_vaccinated | people_fully_vaccinated |
|---|---|---|---|---|---|
| 0 | Afghanistan | 2021-02-22 | 0 | 0.0 | 0.0 |
Countries population metadata¶
# Population per country, used further down to compute per-population rates.
countries_population_df = pd.read_csv(input_dir.joinpath("countries_population.csv"))
countries_population_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 country 196 non-null object
1 population 196 non-null float64
dtypes: float64(1), object(1)
memory usage: 3.2+ KB
Pandemic behavior to date¶
Spread progression¶
# Animated choropleth of confirmed cases per country.
fig = (
    hopkins_tidy_cumulative_df
    .groupby("country")
    # Down-sample every country's series to 4-day bins, keeping the first
    # value of each bin.
    .resample("4D", on="date")
    .first()
    # NOTE(review): dropping the index assumes "country" and "date" are still
    # present as columns after the resample -- confirm for the pandas version in use.
    .reset_index(drop=True)
    # The animation frames are keyed by this column, so turn it into date strings.
    .assign(date=lambda frame: frame["date"].dt.strftime("%Y-%m-%d"))
    .pipe(
        lambda frame: px.choropleth(
            frame,
            locations="country",
            locationmode="country names",
            color="confirmed",
            animation_frame="date",
            color_continuous_scale="Plasma",
            hover_name="country",
            # The country is already the hover title; do not repeat it in the body.
            hover_data={"country": False},
            labels={
                "country": "Country",
                "confirmed": "Confirmed cases",
                "date": "Date",
            },
        )
    )
    .update_geos(fitbounds="locations", visible=False)
    .update_layout(margin=dict(r=0, t=0, l=0, b=0))
)
# Shrink the frame duration used by the first animation button.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1e-10
fig.show()
Death progression¶
# Animated choropleth of deaths per country.
fig = (
    hopkins_tidy_cumulative_df
    .groupby("country")
    # Down-sample every country's series to 4-day bins, keeping the first
    # value of each bin.
    .resample("4D", on="date")
    .first()
    # NOTE(review): dropping the index assumes "country" and "date" are still
    # present as columns after the resample -- confirm for the pandas version in use.
    .reset_index(drop=True)
    # The animation frames are keyed by this column, so turn it into date strings.
    .assign(
        date=lambda df: df.date.dt.strftime("%Y-%m-%d")
    )
    .pipe(
        lambda df: px.choropleth(
            df,
            locations="country",
            locationmode="country names",
            color="deaths",
            animation_frame="date",
            color_continuous_scale="Plasma",
            hover_name="country",
            # Same as the confirmed-cases map: the country is already the
            # hover title, so keep it out of the hover body.
            hover_data=dict(
                country=False,
            ),
            labels=dict(
                country="Country",
                deaths="Deaths",
                date="Date"
            )
        )
    )
    .update_geos(
        fitbounds="locations",
        visible=False
    )
    .update_layout(
        margin={
            "r": 0,
            "t": 0,
            "l": 0,
            "b": 0
        }
    )
)
# Shrink the frame duration used by the first animation button.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1e-10
fig.show()
Affected countries over time¶
# Area chart of the cumulative share of countries that have reported at least
# one confirmed case, by date of their first non-zero record.
(
    hopkins_tidy_cumulative_df
    .remove_columns("deaths")
    # Order rows chronologically within each country so that head(1) below
    # really picks the earliest date with cases, whatever the CSV row order is.
    .sort_values(["country", "date"])
    .filter_on("confirmed != 0")
    .groupby("country")
    .head(1)
    # Number of countries whose first non-zero record falls on each date.
    .groupby("date")
    .count()
    .reset_index()
    .remove_columns("confirmed")
    .assign(
        n_affected_countries=lambda df: df.country.cumsum(),
        # The percentage is relative to the final cumulative count, i.e. to the
        # countries that reported a case at some point, so it always ends at 100%.
        percentage_affected_countries=lambda df: (
            df.n_affected_countries / df.n_affected_countries.max() * 100
        ),
    )
    .pipe(
        lambda df: (
            px.area(
                df,
                x="date",
                y="percentage_affected_countries",
                labels=dict(
                    date="Date",
                    percentage_affected_countries="Percentage of affected countries"
                )
            )
            .update_layout(
                yaxis=dict(ticksuffix="%")
            )
        )
    )
)
Mortality Analysis¶
Create dataset¶
# One row per country with its latest counts, population and derived rates.
worldwide_mortality_df = (
    hopkins_tidy_cumulative_df
    # Default (inner) join: countries without a population entry are dropped.
    .merge(
        countries_population_df,
        on="country"
    )
    # Make the "latest record" explicit instead of relying on the CSV row
    # order: last() below takes the last value of every column per country.
    .sort_values(["country", "date"])
    # NOTE: no minimum population or case-count filter is applied here.
    .groupby("country")
    .last()
    .reset_index()
    .assign(
        # Deaths as a percentage of confirmed cases.
        fatality_rate=lambda df: df.deaths / df.confirmed * 100,
        # Deaths as a percentage of the population.
        mortality_rate=lambda df: df.deaths / df.population * 100
    )
    .select_columns(["country", "confirmed", "deaths", "population", "*_rate"])
)
Identify most affected countries¶
# Names of the ten countries with the most deaths, ordered from most to fewest.
top_10_death_countries = (
    worldwide_mortality_df
    .sort_values(by="deaths", ascending=False)
    .iloc[:10]["country"]
)
Cumulative behavior of the countries¶
# Time series restricted to the ten countries with the most deaths.
interest_countries_cumulative_df = hopkins_tidy_cumulative_df.loc[
    lambda frame: frame["country"].isin(top_10_death_countries)
]
# Heatmap of deaths: one row per country, one column per date.
(
    interest_countries_cumulative_df
    .pivot_wider(index="country", names_from="date", values_from="deaths")
    .set_index("country")
    # Order the rows like the ranking in top_10_death_countries.
    .loc[top_10_death_countries]
    .pipe(
        lambda wide: px.imshow(
            wide,
            labels={"x": "Date", "y": "", "color": "Number of deaths"},
        ).update_layout(margin=dict(r=0, t=0, l=0, b=0))
    )
)
Most affected countries by fatality rate¶
# Mortality summary restricted to the ten countries with the most deaths.
interest_countries_fatality_df = worldwide_mortality_df.loc[
    lambda frame: frame["country"].isin(top_10_death_countries)
]
# Horizontal bar chart of the fatality rate of the selected countries,
# sorted by rate and labelled with the rounded percentage.
(
    interest_countries_fatality_df
    .assign(fatality_rate=lambda frame: frame["fatality_rate"].round(2))
    .sort_values(by="fatality_rate")
    .pipe(
        lambda frame: px.bar(
            frame,
            x="fatality_rate",
            y="country",
            text="fatality_rate",
            labels={"fatality_rate": "Fatality rate", "country": ""},
            hover_name="country",
            # Both values are already visible on the chart itself.
            hover_data={"country": False, "fatality_rate": False},
        ).update_traces(texttemplate="%{text}%")
    )
)
def _fatality_scatter(df):
    """Return a log-log scatter of deaths against confirmed cases.

    Every country is one point, coloured by ``most_affected_by_death``. Gray
    guide lines mark constant fatality rates (0.5%, 1%, 2%, 5% and 10% of the
    confirmed cases) so points can be read against them.

    Args:
        df: Frame with ``country``, ``confirmed``, ``deaths``,
            ``fatality_rate`` and ``most_affected_by_death`` columns.

    Returns:
        The plotly figure.
    """
    # (fraction of confirmed cases, hover label) for each guide line.
    guide_lines = (
        (0.005, "0.5%"),
        (0.010, "1%"),
        (0.020, "2%"),
        (0.05, "5%"),
        (0.1, "10%"),
    )

    fig = px.scatter(
        df,
        x="confirmed",
        y="deaths",
        color="most_affected_by_death",
        labels=dict(
            confirmed="Confirmed cases",
            deaths="Deaths",
            fatality_rate="Fatality rate",
        ),
        hover_name="country",
        hover_data=dict(
            fatality_rate=True,
            confirmed=False,
            deaths=False,
            most_affected_by_death=False,
        ),
        log_x=True,
        log_y=True,
    )

    # add_scatter appends the trace to the figure in place.
    for fraction, label in guide_lines:
        fig.add_scatter(
            x=df.confirmed,
            y=df.confirmed * fraction,
            showlegend=False,
            opacity=0.2,
            line=dict(color="gray"),
            hovertemplate=f"<extra>{label}</extra>",
        )

    return fig.update_layout(
        title="Fatality rate",
        showlegend=False,
    )


(
    worldwide_mortality_df
    .assign(
        fatality_rate=lambda df: df.fatality_rate.round(2),
        # Flag the ten countries with the most deaths so they get their own colour.
        most_affected_by_death=lambda df: (
            df.country.isin(top_10_death_countries)
        ),
    )
    .pipe(_fatality_scatter)
)
Trajectories per country¶
Create dataset¶
# Per country and date: cumulative totals ("*_total") next to the 7-day rolling
# mean of the day-to-day increase ("*_rolling").
trajectories_countries_df = (
    interest_countries_cumulative_df
    .pipe(
        lambda df: (
            df
            .merge(
                (
                    df
                    # Hand only the date and the numeric columns to apply():
                    # diff() cannot subtract the string "country" column, and
                    # whether apply() passes the grouping column depends on the
                    # pandas version.
                    .groupby("country")[["date", "confirmed", "deaths"]]
                    .apply(
                        lambda sub_df: (
                            sub_df
                            .set_index("date")
                            # rolling("7D") needs a monotonic datetime index.
                            .sort_index()
                            .diff()
                            .rolling("7D")
                            .mean()
                        )
                    )
                    .reset_index()
                ),
                on=["country", "date"],
                suffixes=("_total", "_rolling")
            )
        )
    )
    # Date strings are what the trajectory charts show in their hover boxes.
    .assign(
        date=lambda df: df.date.dt.strftime("%Y-%m-%d")
    )
    # The first record of every country has no previous day to diff against.
    .dropna()
)
trajectories_countries_df.head(1)
| | country | date | confirmed_total | deaths_total | confirmed_rolling | deaths_rolling |
|---|---|---|---|---|---|---|
| 1 | Brazil | 2020-01-23 | 0 | 0 | 0.0 | 0.0 |
Confirmed cases¶
# Trajectory chart: rolling new confirmed cases against total confirmed cases,
# one line per country.
(
    px.line(
        trajectories_countries_df,
        x="confirmed_total",
        y="confirmed_rolling",
        color="country",
        labels={
            "confirmed_total": "Total confirmed cases",
            "confirmed_rolling": "Confirmed new cases",
            "country": "Country",
            "date": "Date",
        },
        hover_name="country",
        hover_data={
            "date": True,
            "confirmed_total": True,
            "confirmed_rolling": True,
            "country": False,
        },
    )
    .update_layout(
        title="Confirmed COVID-19 trajectories (7-day moving average)",
        # Horizontal legend placed below the plot area.
        legend={"orientation": "h", "yanchor": "bottom", "y": -0.3},
        margin={"r": 0.7, "l": 0},
    )
)
Deaths¶
Linear scale¶
# Trajectory chart on linear axes: rolling new deaths against total deaths,
# one line per country.
(
    px.line(
        trajectories_countries_df,
        x="deaths_total",
        y="deaths_rolling",
        color="country",
        labels={
            "deaths_total": "Total deaths",
            "deaths_rolling": "New deaths",
            "country": "Country",
            "date": "Date",
        },
        hover_name="country",
        hover_data={
            "date": True,
            "deaths_total": True,
            "deaths_rolling": True,
            "country": False,
        },
    )
    .update_layout(
        title="Deaths COVID-19 trajectories (7-day moving average)",
        # Horizontal legend placed below the plot area.
        legend={"orientation": "h", "yanchor": "bottom", "y": -0.3},
        margin={"r": 0.7, "l": 0},
    )
)
Logarithmic scale (We are all in this together!)¶
# Same deaths trajectory chart as the linear one, drawn on log-log axes.
(
    px.line(
        trajectories_countries_df,
        x="deaths_total",
        y="deaths_rolling",
        color="country",
        labels={
            "deaths_total": "Total deaths",
            "deaths_rolling": "New deaths",
            "country": "Country",
            "date": "Date",
        },
        hover_name="country",
        hover_data={
            "date": True,
            "deaths_total": True,
            "deaths_rolling": True,
            "country": False,
        },
        log_x=True,
        log_y=True,
    )
    .update_layout(
        title="Deaths COVID-19 trajectories (7-day moving average)",
        # Horizontal legend placed below the plot area.
        legend={"orientation": "h", "yanchor": "bottom", "y": -0.3},
        margin={"r": 0.7, "l": 0},
    )
)
Vaccination per country¶
# Treemap of administered doses: one tile per country under a single "World"
# root, sized by the country's latest doses_admin value.
(
    vaccination_tidy_cumulative_df
    # Sort first so that tail(1) keeps each country's most recent record.
    .sort_values(by=["country", "date"])
    .groupby(["country"])
    .tail(1)
    .assign(
        # Each country's share of the doses summed over all countries.
        percentage_accumulated_vaccines=lambda frame: (
            frame["doses_admin"] / frame["doses_admin"].sum() * 100
        ).round(2)
    )
    .pipe(
        lambda frame: px.treemap(
            frame,
            path=[px.Constant("World"), "country"],
            values="doses_admin",
            hover_name="country",
            labels={
                "country": "Country",
                "percentage_accumulated_vaccines": "Percentage of accumulated vaccines",
                "doses_admin": "Administered doses",
                "people_partially_vaccinated": "People partially vaccinated",
                "people_fully_vaccinated": "People fully vaccinated",
            },
            hover_data={
                "country": False,
                "percentage_accumulated_vaccines": True,
                "people_partially_vaccinated": True,
                "people_fully_vaccinated": True,
            },
        )
    )
)